Language Model Dataset¶
Note
We use H. G. Wells's novel The Time Machine as the corpus for training our language model. It is quite small, containing just over 30,000 words.
To turn text into a dataset the model can use, we generally need the following steps:
Read the data
Tokenize
Build the vocabulary
Convert tokens to numeric indices
Create the dataset
Reading the Data¶
import re
import random
import collections
import torch
#@save
def read_time_machine():
    # Read The Time Machine by H. G. Wells
    with open('../data/timemachine.txt') as f:
        lines = f.readlines()
    # Replace runs of non-letters with a space, then lowercase
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]
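A quick sanity check, assuming timemachine.txt has already been downloaded to ../data/ (the path used above):

lines = read_time_machine()
print(f'# text lines: {len(lines)}')
print(lines[0])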
Tokenize¶
A token is the basic unit of text; it can be either a character or a word.
#@save
def tokenize(lines, token_type='char'):
    # Split each line into individual characters or words
    if token_type == 'word':
        return [line.split() for line in lines]
    elif token_type == 'char':
        return [list(line) for line in lines]
    else:
        print('ERROR: unknown token type: ' + token_type)
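For example, the same input tokenized both ways:

tokenize(['the time machine'], token_type='word')
# [['the', 'time', 'machine']]
tokenize(['abc'], token_type='char')
# [['a', 'b', 'c']]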
Building the Vocabulary¶
#@save
class Vocab:
    """Vocabulary of tokens."""
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        # Special tokens such as <pad>
        if reserved_tokens is None:
            reserved_tokens = []
        # Count the occurrences of each token
        counter = collections.Counter([token for line in tokens for token in line])
        # Sort by frequency, most frequent first
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                  reverse=True)
        # The index for the unknown token is 0
        self.unk, uniq_tokens = 0, ['<unk>'] + reserved_tokens
        # Keep only tokens that appear at least min_freq times
        uniq_tokens += [token for token, freq in self.token_freqs
                        if freq >= min_freq and token not in uniq_tokens]
        # Mappings between indices and tokens
        self.idx_to_token, self.token_to_idx = [], dict()
        for token in uniq_tokens:
            self.idx_to_token.append(token)
            self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Convert tokens to indices."""
        if not isinstance(tokens, (list, tuple)):
            # A single token maps directly; unknown tokens map to self.unk
            return self.token_to_idx.get(tokens, self.unk)
        # Recurse over a list of tokens
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Convert indices to tokens."""
        if not isinstance(indices, (list, tuple)):
            # An out-of-range index raises an IndexError
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]
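A small demonstration of how Vocab behaves (the toy tokens here are made up for illustration):

vocab = Vocab([['the', 'time', 'machine', 'the']])
print(vocab['the'])             # 1: most frequent token; <unk> takes index 0
print(vocab[['time', 'xyz']])   # [2, 0]: unknown tokens map to index 0
print(vocab.to_tokens([1, 0]))  # ['the', '<unk>']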
Converting Tokens to Numeric Indices¶
Chain the steps above together.
#@save
def load_corpus_time_machine():
    """Return the time machine corpus and vocabulary."""
    # Tokenize (character level by default)
    tokens = tokenize(read_time_machine())
    # Build the vocabulary
    vocab = Vocab(tokens)
    # Flatten into a List[int], dropping unknown tokens (index 0)
    corpus = [vocab[token] for line in tokens for token in line if vocab[token] != 0]
    return corpus, vocab
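Loading the corpus, again assuming the data file is in place (the exact counts depend on the text file):

corpus, vocab = load_corpus_time_machine()
print(len(corpus), len(vocab))  # total number of character indices, vocabulary size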
Creating the Dataset¶
Implement a data loader that reads the corpus and generates batches.
#@save
class TimeMachineDataLoader:
    """An iterator that yields minibatches of the time machine dataset."""
    def __init__(self, batch_size, num_steps):
        # Load the corpus and vocabulary from the previous step
        self.corpus, self.vocab = load_corpus_time_machine()
        # batch_size: number of samples per batch
        # num_steps: number of tokens (indices) per sample
        self.batch_size, self.num_steps = batch_size, num_steps

    def __iter__(self):
        # Add some randomness: start reading from a random offset
        offset = random.randint(0, self.num_steps - 1)
        num_tokens = ((len(self.corpus) - offset - 1) // self.batch_size) * self.batch_size
        # Shape: (batch_size, -1)
        # To predict the next token, Ys is shifted forward by 1
        Xs = torch.tensor(self.corpus[offset: offset + num_tokens]
                          ).reshape(self.batch_size, -1)
        Ys = torch.tensor(self.corpus[offset + 1: offset + 1 + num_tokens]
                          ).reshape(self.batch_size, -1)
        # Number of batches
        num_batches = Xs.shape[1] // self.num_steps
        for i in range(0, self.num_steps * num_batches, self.num_steps):
            # Take the corresponding columns
            X = Xs[:, i: i + self.num_steps]
            Y = Ys[:, i: i + self.num_steps]
            yield X, Y
Putting It All Together¶
#@save
def load_data_time_machine(batch_size, num_steps):
    """Return the time machine data iterator and the vocabulary."""
    data_iter = TimeMachineDataLoader(batch_size, num_steps)
    return data_iter, data_iter.vocab
# x and y both have shape (batch_size, num_steps)
data_iter, vocab = load_data_time_machine(2, 5)
for x, y in data_iter:
    print(x)
    print(y)
    break
tensor([[ 2,  1,  3,  5, 13],
        [ 9,  4,  3,  1,  3]])
tensor([[ 1,  3,  5, 13,  2],
        [ 4,  3,  1,  3,  9]])